fuente: https://github.com/DiegoKoz/discursos_presidenciales
library(glue)
Attaching package: ‘glue’
The following object is masked from ‘package:dplyr’:
collapse
# Load the speeches and normalise the `texto` column before tokenisation.
df <- read_rds('data/discursos_presidenciales.rds')
df <- df %>%
  mutate(
    # Lower-case and transliterate to ASCII so accented and unaccented
    # spellings of the same word end up as one term.
    texto = tolower(texto),
    texto = stri_trans_general(texto, "Latin-ASCII"),
    # Control whitespace and punctuation become plain separators.
    texto = str_replace_all(texto, '\t', ' '),
    texto = str_replace_all(texto, '\n', ' '),
    texto = str_replace_all(texto, '\r', ' '),
    texto = str_replace_all(texto, '[[:punct:]]', ' '),
    # Each digit becomes NUM, then consecutive NUMs are merged so that any
    # number is represented by a single placeholder token.
    texto = str_replace_all(texto, '\\d', 'NUM'),
    texto = str_replace_all(texto, '(NUM)+', 'NUM'),
    # Collapse whitespace runs and trim LAST: the replacements above can
    # introduce new leading/trailing spaces (e.g. a final full stop), which
    # an earlier trim would leave behind.
    texto = str_replace_all(texto, "\\s+", " "),
    texto = str_trim(texto, side = 'both')
  )
# Read the extra stop-word list (one word per line, no header row).
# FALSE is spelled out because the shorthand `F` is an ordinary variable
# that can be reassigned.
palabras_comunes <- read_csv(file = 'data/r_words.txt', col_names = FALSE)
Parsed with column specification:
cols(
X1 = col_character()
)
# Transliterate the extra stop words to ASCII, as was done for the corpus
# text, and drop duplicates.
palabras_comunes <- unique(
  stri_trans_general(palabras_comunes$X1, "Latin-ASCII")
)

# Build a tm corpus with one document per element of df$texto.
texto <- df$texto
Corpus <- VCorpus(VectorSource(texto))

# Remove tm's Spanish stop words together with the extra list read above.
Corpus <- tm_map(
  Corpus,
  removeWords,
  c(stopwords(kind = "es"), palabras_comunes)
)
# Stemming is left disabled:
# Corpus <- tm_map(Corpus, stemDocument, language = "spanish") # Corpus

# Document-term matrix of the cleaned corpus, and its vocabulary size.
dtm <- DocumentTermMatrix(Corpus)
tm::nTerms(dtm)
[1] 19409
# Remove the empty documents: first count the terms left in each document.
# slam::row_sums() sums the sparse matrix directly, whereas apply() would
# first expand it into a dense documents-by-terms matrix.
rowTotals <- slam::row_sums(dtm)
nDocs(dtm)
[1] 603
# Keep only the documents that still contain at least one term, then
# report how many documents remain.
dtm <- dtm[rowTotals > 0, ]
nDocs(dtm)
[1] 602
# Print a summary of the fitted topic model.
# NOTE(review): lda_fit is not created in the lines shown here — confirm
# where the model is fitted.
lda_fit
A LDA_Gibbs topic model with 10 topics.
# Extract 10 terms per topic from the fitted model and display the table
# (one column per topic).
Terms <- terms(lda_fit, 10)
Terms
Topic 1 Topic 2 Topic 3 Topic 4 Topic 5 Topic 6 Topic 7 Topic 8 Topic 9
[1,] "labor" "organizaciones" "presidente" "pami" "mechita" "club" "periodista" "mar" "elegir"
[2,] "pone" "medico" "paises" "tecnopolis" "julio" "competir" "justicia" "admiracion" "alumna"
[3,] "fragata" "afectando" "mundo" "declaracion" "capaces" "saladillo" "informacion" "alguno" "libremente"
[4,] "satelite" "complejas" "macri" "anteriores" "nacion" "clubes" "congreso" "vecina" "recibirnos"
[5,] "ayuden" "habian" "pais" "deporte" "enfrentamos" "anteriores" "provincias" "colectivo" "alumno"
[6,] "encabezar" "ratificando" "num" "enormemente" "enormes" "construccion" "prensa" "ensena" "podes"
[7,] "entendimiento" "tiempos" "ser" "maravilloso" "expresa" "demostrarles" "publicos" "fronteras" "deporte"
[8,] "funcionar" "trabajaba" "desarrollo" "profesionalismo" "sentirnos" "jubilados" "respecto" "colegio" "dio"
[9,] "jesus" "trabajemos" "primer" "vuelvo" "acompanarnos" "llenos" "reforma" "comunidad" "liderar"
[10,] "juicio" "cabe" "anos" "conoci" "continente" "resignar" "fiscal" "guerra" "gana"
Topic 10
[1,] "num"
[2,] "argentinos"
[3,] "pais"
[4,] "trabajo"
[5,] "anos"
[6,] "aca"
[7,] "mundo"
[8,] "ser"
[9,] "verdad"
[10,] "juntos"
Visualización
#' Convert a fitted topic model into the JSON consumed by LDAvis.
#'
#' @param fitted A fitted topic model for which `posterior()` returns a list
#'   with `terms` (topic-term probabilities) and `topics` (document-topic
#'   probabilities).
#' @param dtm The document-term matrix the model was fitted on. It must have
#'   one row per document in `posterior(fitted)$topics` and one column per
#'   term in `posterior(fitted)$terms`.
#' @return The JSON object returned by `LDAvis::createJSON()`.
topicmodels_json_ldavis <- function(fitted, dtm) {
  # Topic layout for the plot: t-SNE applied to the left singular vectors.
  svd_tsne <- function(x) tsne(svd(x)$u)

  # Find required quantities (posterior() is evaluated once and reused).
  post <- posterior(fitted)
  phi <- as.matrix(post$terms)
  theta <- as.matrix(post$topics)
  vocab <- colnames(phi)
  term_freq <- slam::col_sums(dtm)

  # createJSON() expects the number of TOKENS in each document. Counting the
  # non-zero entries per row (table(dtm$i)) gives the number of distinct
  # terms instead, and silently drops any row without entries.
  doc_length <- as.vector(slam::row_sums(dtm))

  # Convert to json
  LDAvis::createJSON(
    phi = phi,
    theta = theta,
    vocab = vocab,
    mds.method = svd_tsne,
    plot.opts = list(xlab = "", ylab = ""),
    doc.length = doc_length,
    term.frequency = term_freq
  )
}
# Build the LDAvis JSON from the fitted model and the filtered
# document-term matrix.
json_res <- topicmodels_json_ldavis(lda_fit, dtm)
sigma summary: Min. : 33554432 |1st Qu. : 33554432 |Median : 33554432 |Mean : 33554432 |3rd Qu. : 33554432 |Max. : 33554432 |
Epoch: Iteration #100 error is: 14.1744077047285
Epoch: Iteration #200 error is: 0.6642175441671
Epoch: Iteration #300 error is: 0.375647353159243
Epoch: Iteration #400 error is: 0.276097083111349
Epoch: Iteration #500 error is: 0.2622322268503
Epoch: Iteration #600 error is: 0.253897306321437
Epoch: Iteration #700 error is: 0.252719375064657
Epoch: Iteration #800 error is: 0.252652072110335
Epoch: Iteration #900 error is: 0.252626857162804
Epoch: Iteration #1000 error is: 0.252588385221556
# Serve the interactive visualisation built from the JSON above.
serVis(json_res)
createTcpServer: address already in use
To stop the server, run servr::daemon_stop(2) or restart your R session
Serving the directory /tmp/RtmpVR2iwu/file15a266d88ad3 at http://127.0.0.1:3222